FIFA 22 PLAYERS ANALYSIS
Data and exploration
Data
fifa <- read.csv("players_22.csv")
#head(fifa, 3)
Missing values
missing.values <- fifa %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num.missing = n()) %>%
filter(is.missing == T) %>%
select(-is.missing) %>%
arrange(desc(num.missing))
missing.values %>%
ggplot() +
geom_bar(aes(x = key, y = num.missing), stat = "identity") +
labs(x = "variable", y = "number of missing values", title = "Number of missing values") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

Duplicates
sprintf("Number of duplicates columns:")
## [1] "Number of duplicates columns:"
sum(duplicated(fifa$long_name) == TRUE)
## [1] 20
sprintf("Dimension before")
## [1] "Dimension before"
dim(fifa)
## [1] 19239 110
fifa <- fifa %>% distinct(long_name, .keep_all = TRUE)
sprintf("Dimension after")
## [1] "Dimension after"
dim(fifa)
## [1] 19219 110
Data Cleaning
fifa <- fifa %>%
mutate(wage_eur = ifelse(is.na(wage_eur),
median(wage_eur, na.rm = T),
wage_eur
)) %>%
mutate(value_eur = ifelse(is.na(value_eur),
median(value_eur, na.rm = T),
value_eur
)) %>%
mutate(passing = ifelse(is.na(passing),
median(passing, na.rm = T),
passing
)) %>%
mutate(dribbling = ifelse(is.na(dribbling),
median(dribbling, na.rm = T),
dribbling
)) %>%
mutate(pace = ifelse(is.na(pace),
median(pace, na.rm = T),
pace
)) %>%
mutate(defending = ifelse(is.na(defending),
median(defending, na.rm = T),
defending
)) %>%
mutate(shooting = ifelse(is.na(shooting),
median(shooting, na.rm = T),
shooting
)) %>%
mutate(physic = ifelse(is.na(physic),
median(physic, na.rm = T),
physic
))
Visaulizing Amounts
league1 <- table(fifa$league_name)
league2=as.data.frame(league1)
league3 <- league2 %>%
mutate(Freq = as.numeric(Freq)) %>%
arrange(desc(Freq)) %>%
slice(1:5)
par(las=2)
par(mar=c(5,10,4,2))
barplot(league3$Freq,names.arg=c("USA Major League Soccer", "Argentina Primera División",
"English League Championship", "English Premier League", "Spain Primera Division"), cex.names=0.8,xlab="Number of Players",col="blue",
main="Top 5 Leagues with Highest Number of Players",border="red", horiz=TRUE)

Where are the players from?
fifa_country_count <- fifa %>%
group_by(nationality_name) %>%
summarize(Freq = n())
fifa_country_count$nationality_name[fifa_country_count$nationality_name
== "United States"] <- "United States of America"
fifa_country_count
## # A tibble: 163 × 2
## nationality_name Freq
## <chr> <int>
## 1 Afghanistan 1
## 2 Albania 46
## 3 Algeria 51
## 4 Andorra 1
## 5 Angola 17
## 6 Antigua and Barbuda 3
## 7 Argentina 960
## 8 Armenia 7
## 9 Australia 266
## 10 Austria 318
## # … with 153 more rows
library(plotly)
country_with_code <- read.csv("https://raw.githubusercontent.com/lukes/ISO-3166-Countries-with-Regional-Codes/master/slim-3/slim-3.csv")
head(country_with_code)
## name alpha.3 country.code
## 1 Afghanistan AFG 4
## 2 Ã…land Islands ALA 248
## 3 Albania ALB 8
## 4 Algeria DZA 12
## 5 American Samoa ASM 16
## 6 Andorra AND 20
fifa_country_count_with_code <- fifa_country_count %>%
left_join(country_with_code,
by = c("nationality_name" = "name")
)
fig <- plot_ly(fifa_country_count_with_code,
type = "choropleth",
locations = fifa_country_count_with_code$alpha.3,
z = fifa_country_count_with_code$Freq,
text = fifa_country_count_with_code$nationality_name,
colorscale = "ice"
)
fig <- fig %>% colorbar(title = "No of players")
fig <- fig %>% layout(
title = "Choropleth showing FIFA 2022 Players' Nationality <br>(Hover for breakdown)"
)
fig
Visaulizing Proportions
nation1 <- table(fifa$nationality_name)
nation2=as.data.frame(nation1)
nation3 <- nation2 %>%
mutate(Freq = as.numeric(Freq)) %>%
arrange(desc(Freq)) %>%
slice(1:5)
pct <- round(nation3$Freq/sum(nation3$Freq)*100)
nation3$Var1 <- paste(pct,"%",sep="")
colors <- c("grey", "blue","green", "yellow", "red")
pie(nation3$Freq,labels = nation3$Var1, col = colors,
main="Top 5 Countries with Highest Number of Players", cex=0.8)
legend(1.2, .5, c("England", "Germany", "Spain", "France", "Argentina"), fill = colors)

Age Distribution
ggplot(fifa, aes(x = age)) +
geom_boxplot() +
labs(
title = "The Distribution of Age",
x = "Age"
)

Oldest Players
old <- fifa %>%
arrange(desc(age)) %>%
select(short_name, nationality_name, age)
head(old)
## short_name nationality_name age
## 1 K. Miura Japan 54
## 2 G. Buffon Italy 43
## 3 C. Lucchetti Argentina 43
## 4 S. Nakamura Japan 43
## 5 D. Vaca Bolivia 42
## 6 K. Ellison England 42
Youngest Players
young <- fifa %>%
arrange(age) %>%
select(short_name, nationality_name, age)
head(young)
## short_name nationality_name age
## 1 Gavi Spain 16
## 2 V. Barco Argentina 16
## 3 A. Kalogeropoulos Greece 16
## 4 Yayo Spain 16
## 5 R. van den Berg Netherlands 16
## 6 A. Musi Romania 16
Some line charts with age
Players Attributes vs. Age
filtered_attributes <- fifa %>%
group_by(age) %>%
summarise_at(
vars(overall, potential, pace),
list(mean)
)
filtered_attributes
## # A tibble: 29 × 4
## age overall potential pace
## <int> <dbl> <dbl> <dbl>
## 1 16 55.8 75.9 68.5
## 2 17 56.6 74.8 67.9
## 3 18 58.0 74.1 68.5
## 4 19 59.2 73.3 68.3
## 5 20 60.7 72.5 68.9
## 6 21 63.2 72.7 69.4
## 7 22 64.0 72.4 70.0
## 8 23 65.7 72.5 70.5
## 9 24 66.6 71.9 70.5
## 10 25 67.6 71.1 70.3
## # … with 19 more rows
library("reshape2")
data_long <- melt(filtered_attributes, id = "age")
ggplot(
data_long,
aes(
x = age,
y = value,
color = variable
)
) +
geom_line() +
labs(
y = "Player Atrributes Rating",
x = "Age",
color = "Color Legend",
title = "Attributes vs Age",
)

Players Value vs. Age
money_attributes <- fifa %>%
group_by(age) %>%
summarise_at(
vars(value_eur),
list(mean)
)
ggplot(money_attributes, aes(x = age, y = value_eur)) +
geom_line() +
labs(
y = "Player Value in Millions",
x = "Age",
title = "Player value vs Age",
) +
scale_y_continuous(labels = label_number(suffix = " M", scale = 1e-6)) # thousands

Modeling
hist(fifa$overall, xlab="overall Rating", main = "Histogram of Overall Rating")

fifa$overall = log10(fifa$overall)
hist(fifa$overall, xlab = "Log10 of Overall Rating", main = "Histogram of Overall Rating- Log10")

hist(fifa$wage_eur, xlab = "Wage in Euros", main = "Histogram of Wage")

fifa$wage_eur = log10(fifa$wage_eur)
#fifa$wage_eur = log(fifa$overall)
hist(fifa$overall, xlab="Wage in Euros", main="Histogram of Wage- Log10")

Train / test
dt <- sort(sample(nrow(fifa), nrow(fifa) * .7))
train <- fifa[dt, ]
test <- fifa[-dt, ]
train <- train %>% filter(!is.na(overall))
test <- test %>% filter(!is.na(overall))
overall_individualskills <- lm(overall ~ pace + shooting + passing + dribbling + defending +
physic + age + preferred_foot + attacking_crossing + attacking_finishing + attacking_heading_accuracy +
attacking_short_passing + attacking_volleys + skill_dribbling +skill_curve +skill_fk_accuracy +
skill_long_passing +skill_ball_control +movement_acceleration +movement_sprint_speed +
movement_agility +movement_reactions +movement_balance +power_shot_power +power_jumping+ power_stamina +
power_strength +power_long_shots +mentality_aggression +mentality_interceptions +mentality_positioning +
mentality_vision +mentality_penalties +mentality_composure +defending_marking_awareness +defending_standing_tackle +
defending_sliding_tackle, data = train)
summary(overall_individualskills)
##
## Call:
## lm(formula = overall ~ pace + shooting + passing + dribbling +
## defending + physic + age + preferred_foot + attacking_crossing +
## attacking_finishing + attacking_heading_accuracy + attacking_short_passing +
## attacking_volleys + skill_dribbling + skill_curve + skill_fk_accuracy +
## skill_long_passing + skill_ball_control + movement_acceleration +
## movement_sprint_speed + movement_agility + movement_reactions +
## movement_balance + power_shot_power + power_jumping + power_stamina +
## power_strength + power_long_shots + mentality_aggression +
## mentality_interceptions + mentality_positioning + mentality_vision +
## mentality_penalties + mentality_composure + defending_marking_awareness +
## defending_standing_tackle + defending_sliding_tackle, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.097935 -0.009787 0.000010 0.010223 0.094991
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.437e+00 2.140e-03 671.663 < 2e-16 ***
## pace 8.007e-05 5.697e-05 1.405 0.159924
## shooting -2.228e-03 1.882e-04 -11.840 < 2e-16 ***
## passing 1.869e-04 1.093e-04 1.711 0.087173 .
## dribbling 1.983e-03 1.562e-04 12.692 < 2e-16 ***
## defending 2.351e-03 1.562e-04 15.053 < 2e-16 ***
## physic -2.428e-04 7.689e-05 -3.159 0.001589 **
## age 7.849e-04 3.976e-05 19.742 < 2e-16 ***
## preferred_footRight -2.582e-03 3.533e-04 -7.307 2.88e-13 ***
## attacking_crossing 2.026e-04 2.981e-05 6.795 1.13e-11 ***
## attacking_finishing 1.120e-03 8.725e-05 12.840 < 2e-16 ***
## attacking_heading_accuracy 1.487e-04 2.475e-05 6.009 1.91e-09 ***
## attacking_short_passing 5.811e-04 4.706e-05 12.348 < 2e-16 ***
## attacking_volleys 1.369e-05 2.424e-05 0.565 0.572090
## skill_dribbling -6.984e-04 7.608e-05 -9.179 < 2e-16 ***
## skill_curve -9.598e-05 2.257e-05 -4.252 2.13e-05 ***
## skill_fk_accuracy -2.607e-05 2.008e-05 -1.299 0.194121
## skill_long_passing -2.347e-04 3.135e-05 -7.486 7.52e-14 ***
## skill_ball_control 4.855e-04 5.558e-05 8.737 < 2e-16 ***
## movement_acceleration 2.942e-04 3.644e-05 8.073 7.45e-16 ***
## movement_sprint_speed 1.676e-04 3.890e-05 4.309 1.65e-05 ***
## movement_agility -1.655e-04 2.574e-05 -6.430 1.32e-10 ***
## movement_reactions 2.153e-03 2.971e-05 72.450 < 2e-16 ***
## movement_balance -2.470e-04 2.066e-05 -11.953 < 2e-16 ***
## power_shot_power 8.340e-04 4.436e-05 18.800 < 2e-16 ***
## power_jumping 5.397e-05 1.613e-05 3.345 0.000824 ***
## power_stamina 2.120e-04 2.617e-05 8.100 5.97e-16 ***
## power_strength 3.443e-04 3.682e-05 9.351 < 2e-16 ***
## power_long_shots 1.579e-04 4.470e-05 3.533 0.000412 ***
## mentality_aggression -3.608e-05 2.295e-05 -1.572 0.115967
## mentality_interceptions -6.039e-04 4.110e-05 -14.696 < 2e-16 ***
## mentality_positioning -3.453e-04 2.706e-05 -12.761 < 2e-16 ***
## mentality_vision -7.748e-05 2.741e-05 -2.826 0.004718 **
## mentality_penalties 1.295e-04 2.286e-05 5.663 1.52e-08 ***
## mentality_composure 5.667e-04 2.495e-05 22.719 < 2e-16 ***
## defending_marking_awareness -4.961e-04 5.006e-05 -9.911 < 2e-16 ***
## defending_standing_tackle -5.847e-04 5.939e-05 -9.845 < 2e-16 ***
## defending_sliding_tackle -2.556e-04 3.922e-05 -6.516 7.47e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01663 on 13415 degrees of freedom
## Multiple R-squared: 0.8685, Adjusted R-squared: 0.8682
## F-statistic: 2395 on 37 and 13415 DF, p-value: < 2.2e-16
wage_individualskills <- lm(wage_eur ~ pace + shooting + passing + dribbling + defending +
physic + age + preferred_foot + attacking_crossing + attacking_finishing + attacking_heading_accuracy +
attacking_short_passing + attacking_volleys + skill_dribbling +skill_curve +skill_fk_accuracy +
skill_long_passing +skill_ball_control +movement_acceleration +movement_sprint_speed +movement_agility +
movement_reactions +movement_balance +power_shot_power +power_jumping+ power_stamina +
power_strength +power_long_shots +mentality_aggression +mentality_interceptions +mentality_positioning +
mentality_vision +mentality_penalties +mentality_composure +defending_marking_awareness +defending_standing_tackle +
defending_sliding_tackle, data = train)
summary(wage_individualskills)
##
## Call:
## lm(formula = wage_eur ~ pace + shooting + passing + dribbling +
## defending + physic + age + preferred_foot + attacking_crossing +
## attacking_finishing + attacking_heading_accuracy + attacking_short_passing +
## attacking_volleys + skill_dribbling + skill_curve + skill_fk_accuracy +
## skill_long_passing + skill_ball_control + movement_acceleration +
## movement_sprint_speed + movement_agility + movement_reactions +
## movement_balance + power_shot_power + power_jumping + power_stamina +
## power_strength + power_long_shots + mentality_aggression +
## mentality_interceptions + mentality_positioning + mentality_vision +
## mentality_penalties + mentality_composure + defending_marking_awareness +
## defending_standing_tackle + defending_sliding_tackle, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.6740 -0.2350 0.0160 0.2624 1.3419
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.1012120 0.0495589 -2.042 0.041145 *
## pace -0.0010307 0.0013196 -0.781 0.434752
## shooting -0.0173958 0.0043590 -3.991 6.62e-05 ***
## passing -0.0018153 0.0025307 -0.717 0.473201
## dribbling 0.0176433 0.0036188 4.875 1.10e-06 ***
## defending 0.0266347 0.0036178 7.362 1.92e-13 ***
## physic -0.0027722 0.0017809 -1.557 0.119593
## age -0.0093548 0.0009210 -10.158 < 2e-16 ***
## preferred_footRight -0.0123259 0.0081832 -1.506 0.132030
## attacking_crossing 0.0026669 0.0006906 3.862 0.000113 ***
## attacking_finishing 0.0077875 0.0020210 3.853 0.000117 ***
## attacking_heading_accuracy 0.0040710 0.0005732 7.102 1.29e-12 ***
## attacking_short_passing 0.0022091 0.0010900 2.027 0.042716 *
## attacking_volleys 0.0023400 0.0005614 4.168 3.09e-05 ***
## skill_dribbling -0.0028624 0.0017622 -1.624 0.104324
## skill_curve 0.0020949 0.0005228 4.007 6.19e-05 ***
## skill_fk_accuracy -0.0013095 0.0004650 -2.816 0.004868 **
## skill_long_passing -0.0008253 0.0007262 -1.136 0.255807
## skill_ball_control 0.0015643 0.0012873 1.215 0.224301
## movement_acceleration 0.0016593 0.0008441 1.966 0.049339 *
## movement_sprint_speed 0.0035286 0.0009010 3.916 9.04e-05 ***
## movement_agility -0.0011727 0.0005963 -1.967 0.049233 *
## movement_reactions 0.0233212 0.0006883 33.883 < 2e-16 ***
## movement_balance -0.0016850 0.0004786 -3.521 0.000431 ***
## power_shot_power 0.0091222 0.0010275 8.878 < 2e-16 ***
## power_jumping 0.0010828 0.0003737 2.898 0.003765 **
## power_stamina -0.0011059 0.0006062 -1.824 0.068130 .
## power_strength 0.0025534 0.0008529 2.994 0.002760 **
## power_long_shots -0.0000397 0.0010354 -0.038 0.969411
## mentality_aggression 0.0018055 0.0005316 3.396 0.000685 ***
## mentality_interceptions -0.0073270 0.0009519 -7.697 1.49e-14 ***
## mentality_positioning -0.0035675 0.0006268 -5.692 1.28e-08 ***
## mentality_vision 0.0004527 0.0006350 0.713 0.475894
## mentality_penalties 0.0012220 0.0005295 2.308 0.021023 *
## mentality_composure 0.0086118 0.0005778 14.905 < 2e-16 ***
## defending_marking_awareness -0.0055073 0.0011595 -4.750 2.06e-06 ***
## defending_standing_tackle -0.0065354 0.0013756 -4.751 2.05e-06 ***
## defending_sliding_tackle -0.0022125 0.0009084 -2.436 0.014879 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3852 on 13415 degrees of freedom
## Multiple R-squared: 0.5699, Adjusted R-squared: 0.5688
## F-statistic: 480.5 on 37 and 13415 DF, p-value: < 2.2e-16
overalldata <- data.frame(actual= test$overall, predicted = predict(overall_individualskills, test))
wagedata <- data.frame(actual= test$wage_eur, predicted = predict(wage_individualskills, test))
#colSums(is.na(data2))
m2wage <- mean((wagedata$actual- wagedata$predicted)^2)
m2wage
## [1] 0.1543064
m2overall <- mean((overalldata$actual - overalldata$predicted)^2)
m2overall
## [1] 0.0002738323
ggplot(wagedata, aes(x = predicted, y = actual)) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
labs(x='Transformed Predicted Wage Values', y='Transformed Actual Wage Values',
title='Transformed Predicted vs. Transformed Actual Wage Values')

ggplot(overalldata, aes(x = predicted, y = actual)) +
geom_point() +
geom_abline(intercept = 0, slope = 1) +
labs(x='Transformed Predicted Overall Values', y='Transformed Actual Overall Values',
title='Transformed Predicted vs. Transformed Actual Overall Values')

Ghana players cluster
# filter out players from team Ghana who arenot reserves or substitutes
ghana <- fifa %>%
filter(fifa$nationality_name == "Ghana" & fifa$club_position != "RES" &
fifa$club_position != "SUB") %>%
select(short_name, shooting, pace, passing, dribbling, defending)
# drop duplicates if any
ghana <- ghana %>% distinct(short_name, .keep_all = TRUE)
# make player name as rownames
rownames(ghana) <- ghana$short_name
ghana2 <- ghana %>% select(2:5)
# perform k means clustering with 10 centers
k2 <- kmeans(ghana2, centers = 10, nstart = 25)
# str(k2)
fviz_cluster(k2, data = ghana2)

Current Portugal Team Cluster
portugal_current <- c(
"André Silva", "Antonio Silva", "Bernardo Silva", "Bruno Fernandes",
"Cristiano Ronaldo", "Danilo Pereira", "Diogo Costa", "Diogo Dalot",
"Gonçalo Ramos", "João Cancelo", "João Félix", "João Mário", "Palhinha",
"José Sá", "Matheus Nunes", "Nuno Mendes", "Otavio", "Pepe", "Rafael Leão",
"Raphael Guzzo", "Ricardo Horta", "Rúben Dias", "Ruben Neves", "Rui PatrÃcio",
"Vitinha", "William Carvalho"
)
portugal <- fifa %>%
filter(fifa$nationality_name == "Portugal") %>%
select(short_name, shooting, pace, passing, dribbling, defending)
# filter out the players in current national team
portugal <- portugal[portugal$short_name %in% portugal_current, ]
# drop duplicates if any
portugal <- portugal %>% distinct(short_name, .keep_all = TRUE)
# make player name as rownames
rownames(portugal) <- portugal$short_name
portugal2 <- portugal %>% select(2:5)
# perform k means clustering with 5 centers
k2 <- kmeans(portugal2, centers = 5, nstart = 25)
# str(k2)
fviz_cluster(k2, data = portugal2)

res <- hcut(portugal2, k = 4, stand = TRUE)
# Visualize
fviz_dend(res, rect = TRUE, cex = 0.5,
k_colors = c("#00AFBB","#2E9FDF", "#E7B800", "#FC4E07"))
